from typing import Dict
from datasets import load_dataset, Dataset
import re

def split_into_sentences(text: str) -> list:
    """
    Split text into sentences while preserving punctuation.
    Handles multiple newlines by converting them to single newlines.

    A sentence boundary is any run of whitespace that directly follows
    '.', '!' or '?'. Text after the last boundary (including a final
    sentence with no trailing whitespace, or text with no terminator at
    all) is returned as the last sentence rather than being discarded.

    Args:
        text (str): Input text to split

    Returns:
        list: List of non-empty sentences with surrounding whitespace
            stripped; empty if the input has no non-whitespace content
    """
    # Collapse runs of newlines so a newline left inside a sentence is single
    text = re.sub(r'\n+', '\n', text)

    # The lookbehind leaves the terminator attached to its sentence, so no
    # re-pairing of fragments is needed and the trailing piece is kept.
    pieces = re.split(r'(?<=[.!?])\s+', text)

    # Drop fragments that are empty once surrounding whitespace is removed
    return [sentence for sentence in (p.strip() for p in pieces) if sentence]

class CNNDataset(Dataset):
    """Index-addressable view of CNN/DailyMail with articles split into sentences."""

    # NOTE(review): the base class is `Dataset` imported from `datasets`, and
    # its initializer is not invoked here — confirm this base is intended.

    def __init__(self, split="test"):
        """
        Load one split of the CNN/DailyMail corpus.

        Args:
            split (str): Name of the split to load ('train', 'validation', 'test')
        """
        self.dataset = load_dataset("abisee/cnn_dailymail", "1.0.0", split=split)

    def __len__(self) -> int:
        """Return how many items the loaded split contains."""
        return len(self.dataset)

    def __getitem__(self, idx) -> Dict:
        """
        Fetch one item and split its article into sentences.

        Args:
            idx (int): Position of the item in the loaded split

        Returns:
            Dict: Mapping with "sentences" (the article split by
                split_into_sentences) and "highlights" (returned unchanged)
        """
        record = self.dataset[idx]
        article, highlights = record['article'], record['highlights']

        return dict(sentences=split_into_sentences(article), highlights=highlights)
